Dual CRISPR Screen Analysis

Construct Counting

Amanda Birmingham, CCBB, UCSD (abirmingham@ucsd.edu)

Instructions

To run this notebook reproducibly, follow these steps:

  1. Click Kernel > Restart & Clear Output
  2. When prompted, click the red Restart & clear all outputs button
  3. Fill in the values for your analysis for each of the variables in the Input Parameters section
  4. Click Cell > Run All

Input Parameters


In [ ]:
g_timestamp = ''
g_num_processors = 3
g_filtered_fastqs_dir = '/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/data/interim/20160504_D00611_0275_AHMM2JBCXX'
g_count_alg_name = '19mer_1mm_py'
g_constructs_fp = '/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/data/raw/general/Cancer_rep2_CRV4.txt'
g_col_indices_str = "1,3,5"
g_len_of_seq_to_match = 19
g_num_allowed_mismatches = 1
g_fastq_counts_dir = '/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/data/interim/20160504_D00611_0275_AHMM2JBCXX'
g_fastq_counts_run_prefix = '20160504_CRV4_plasmid_20160701134823'
g_code_location = '/Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python'
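
As a quick, optional sanity check of the parameters above, the cell below previews the first two lines of the constructs file so that the column indices in g_col_indices_str (interpreted by the construct-extraction code later in this notebook as the zero-based positions of the construct ID, gene A gRNA sequence, and gene B gRNA sequence columns) can be verified by eye. It assumes only that g_constructs_fp points to a readable, tab-delimited text file.


In [ ]:
# Optional sanity check: preview the top of the constructs file
# (assumes g_constructs_fp exists and is plain tab-delimited text)
with open(g_constructs_fp, 'r') as constructs_preview_handle:
    for _ in range(2):
        print(constructs_preview_handle.readline().rstrip())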

CCBB Library Imports


In [ ]:
import sys
sys.path.append(g_code_location)

Automated Set-Up


In [ ]:
# %load -s describe_var_list /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/utilities/analysis_run_prefixes.py
def describe_var_list(input_var_name_list):
    description_list = ["{0}: {1}\n".format(name, eval(name)) for name in input_var_name_list]
    return "".join(description_list)

In [ ]:
from ccbbucsd.utilities.analysis_run_prefixes import check_or_set, get_run_prefix, get_timestamp
g_timestamp = check_or_set(g_timestamp, get_timestamp())
g_fastq_counts_dir = check_or_set(g_fastq_counts_dir, g_filtered_fastqs_dir)
g_fastq_counts_run_prefix = check_or_set(g_fastq_counts_run_prefix, 
                                         get_run_prefix(None, g_count_alg_name, g_timestamp))
print(describe_var_list(['g_timestamp','g_fastq_counts_dir', 'g_fastq_counts_run_prefix']))

In [ ]:
from ccbbucsd.utilities.files_and_paths import verify_or_make_dir
verify_or_make_dir(g_fastq_counts_dir)

Info Logging Pass-Through


In [ ]:
from ccbbucsd.utilities.notebook_logging import set_stdout_info_logger
set_stdout_info_logger()

Construct Counting Functions


In [ ]:
# %load -s get_filtered_file_suffix /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/malicrispr/count_filterer.py
def get_filtered_file_suffix():
    return "_len_filtered.fastq"

In [ ]:
# %load /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/malicrispr/construct_file_extracter.py
# third-party libraries
import pandas

# ccbb libraries
from ccbbucsd.utilities.bio_seq_utilities import trim_seq

__author__ = "Amanda Birmingham"
__maintainer__ = "Amanda Birmingham"
__email__ = "abirmingham@ucsd.edu"
__status__ = "prototype"

_CONSTRUCT_ID = "CONSTRUCT_ID"
_GENE_A_SEQ = "GENE_A_SEQ"
_GENE_B_SEQ = "GENE_B_SEQ"
_CONSTRUCT_A_NAME = "CONSTRUCT_A_NAME"
_CONSTRUCT_B_NAME = "CONSTRUCT_B_NAME"


def get_construct_separator():
    return "__"


def extract_construct_and_grna_info(constructs_fp, column_indices):
    construct_table = _read_in_construct_table(constructs_fp, column_indices, rows_to_skip=1)
    seq_name_sets = _extract_grnas_from_construct_table(construct_table)
    grna_name_seq_pairs = _format_and_check_grnas_input(seq_name_sets)
    construct_names = construct_table[_CONSTRUCT_ID].unique().tolist()
    return construct_names, grna_name_seq_pairs


def _read_in_construct_table(constructs_fp, column_indices, rows_to_skip=1):
    result = pandas.read_table(constructs_fp, skiprows=rows_to_skip, header=None)
    result = _rename_columns(result, column_indices)
    return result


def _rename_columns(construct_table, column_indices):
    new_names = [_CONSTRUCT_ID, _GENE_A_SEQ, _GENE_B_SEQ]
    existing_names = list(construct_table.columns.values)

    existing_to_new_names = {}
    for curr_index in range(0, len(column_indices)):
        curr_col_index = column_indices[curr_index]
        curr_existing_name = existing_names[curr_col_index]
        existing_to_new_names[curr_existing_name] = new_names[curr_index]

    return construct_table.rename(columns=existing_to_new_names)


def _extract_grnas_from_construct_table(construct_table):
    grna_name_key = "grna_name"
    grna_seq_key = "grna_seq"

    # split the construct id in each row into two pieces--the two construct names--and put them into new columns
    construct_table[_CONSTRUCT_A_NAME], construct_table[_CONSTRUCT_B_NAME] = \
        zip(*construct_table[_CONSTRUCT_ID].str.split(get_construct_separator()).tolist())

    # get the gene/sequence pairs for each of the two genes and concatenate them
    gene_a_pairs = _extract_grna_name_and_seq_df(construct_table, _CONSTRUCT_A_NAME,
                                                 _GENE_A_SEQ, grna_name_key, grna_seq_key)
    gene_b_pairs = _extract_grna_name_and_seq_df(construct_table, _CONSTRUCT_B_NAME,
                                                 _GENE_B_SEQ, grna_name_key, grna_seq_key)
    gene_pairs = pandas.concat([gene_a_pairs, gene_b_pairs])
    gene_pairs[grna_seq_key] = gene_pairs[grna_seq_key].str.upper()  # upper-case all gRNA sequences

    # extract only the unique pairs
    grna_seq_name_groups = gene_pairs.groupby([grna_seq_key, grna_name_key]).groups
    result = [x for x in grna_seq_name_groups]
    return sorted(result)  # NB sort so that output order is predictable


def _extract_grna_name_and_seq_df(construct_table, name_key, seq_key, grna_name_key, grna_seq_key):
    # copy the two relevant columns so the rename below operates on an independent frame
    # rather than a view of construct_table (avoids pandas' chained-assignment warning)
    name_and_seq_df = construct_table[[name_key, seq_key]].copy()
    name_and_seq_df.rename(columns={name_key: grna_name_key, seq_key: grna_seq_key}, inplace=True)
    return name_and_seq_df


def trim_grnas(grnas_name_and_seq_list, retain_len):
    result = []
    for name_seq_tuple in grnas_name_and_seq_list:
        grna_name = name_seq_tuple[0]
        full_seq = name_seq_tuple[1]
        trimmed_seq = trim_seq(full_seq, retain_len, False)  # False = retain sequence from the 3' end rather than the 5' end
        result.append((grna_name, trimmed_seq))
    return result


def _read_in_grnas(grnas_fp):
    list_from_file = _read_grnas_input(grnas_fp)
    return _format_and_check_grnas_input(list_from_file)


def _read_grnas_input(grnas_fp, comment_prefix="#", delimiter="\t"):
    result = []
    with open(grnas_fp, 'r') as file_handle:
        for line in file_handle:
            if not line.startswith(comment_prefix):
                pieces = line.strip().split(delimiter)
                result.append(pieces)
    return result


def _format_and_check_grnas_input(grnas_seq_and_name_list):
    expected_num_pieces = 2
    seqs_by_names = {}
    names_by_seqs = {}
    result = []

    for curr_set in grnas_seq_and_name_list:
        if len(curr_set) != expected_num_pieces:
            raise ValueError(
                "input '{0}' has {1} pieces instead of the expected {2}".format(
                    curr_set, len(curr_set), expected_num_pieces
                ))
        curr_seq = curr_set[0]
        curr_name = curr_set[1]

        if curr_seq in names_by_seqs:
            raise ValueError(
                "sequence '{0}' associated with name '{1}' but was already associated with name '{2}'".format(
                    curr_seq, curr_name, names_by_seqs[curr_seq]
                ))

        if curr_name in seqs_by_names:
            raise ValueError(
                "name '{0}' associated with sequence '{1}' but was already associated with sequence '{2}'".format(
                    curr_name, curr_seq, seqs_by_names[curr_name]
                ))

        names_by_seqs[curr_seq] = curr_name
        seqs_by_names[curr_name] = curr_seq
        result.append((curr_name, curr_seq))
    # next pair in grnas_seq_and_name_list

    return result
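
The cell below is a small, optional illustration (using made-up gRNA names and sequences, not real screen data) of the conventions this module relies on: a construct ID is two gRNA names joined by the separator returned by get_construct_separator, and _format_and_check_grnas_input takes (sequence, name) pairs and returns validated (name, sequence) tuples, raising an error on duplicate names or sequences.


In [ ]:
# Optional illustration with hypothetical gRNA names and sequences (not part of the real screen data)
example_construct_id = "GeneA_gRNA1" + get_construct_separator() + "GeneB_gRNA2"
print(example_construct_id.split(get_construct_separator()))  # the two gRNA names making up the construct

example_seq_and_name_pairs = [("ACGTACGTACGTACGTACG", "GeneA_gRNA1"),
                              ("TGCATGCATGCATGCATGC", "GeneB_gRNA2")]
print(_format_and_check_grnas_input(example_seq_and_name_pairs))  # validated (name, sequence) tuples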

In [ ]:
# %load /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/malicrispr/grna_position_matcher.py
# ccbb libraries
from ccbbucsd.utilities.bio_seq_utilities import rev_comp_canonical_dna_seq

__author__ = "Amanda Birmingham"
__maintainer__ = "Amanda Birmingham"
__email__ = "abirmingham@ucsd.edu"
__status__ = "development"


class GrnaPositionMatcher:
    @staticmethod
    def _generate_seqs_to_check(fw_whole_seq, rv_whole_seq):
        rc_whole_rv_seq = rev_comp_canonical_dna_seq(rv_whole_seq)
        return fw_whole_seq, rc_whole_rv_seq

    def __init__(self, grna_names_and_seqs, expected_len, num_allowed_fw_mismatches, num_allowed_rv_mismatches):
        self._grna_names_and_seqs = grna_names_and_seqs
        self._num_allowed_fw_mismatches = num_allowed_fw_mismatches
        self._num_allowed_rv_mismatches = num_allowed_rv_mismatches
        self._seq_len = expected_len

    @property
    def num_allowed_fw_mismatches(self):
        return self._num_allowed_fw_mismatches

    @property
    def num_allowed_rv_mismatches(self):
        return self._num_allowed_rv_mismatches

    def find_fw_and_rv_read_matches(self, fw_whole_seq, rv_whole_seq):
        fw_construct_window, rc_rv_construct_window = self._generate_seqs_to_check(fw_whole_seq, rv_whole_seq)
        return self._id_pair_matches(fw_construct_window, rc_rv_construct_window)

    def _id_pair_matches(self, input1_seq, input2_seq):
        input2_match_name = None
        input1_match_name = self._id_sequence_match(self.num_allowed_fw_mismatches, input1_seq)
        if input1_match_name is not None:
            input2_match_name = self._id_sequence_match(self.num_allowed_rv_mismatches, input2_seq)

        return input1_match_name, input2_match_name

    def _id_sequence_match(self, num_allowed_mismatches, input_seq):
        found_name = None

        # check for perfect matches
        for potential_name, potential_reference in self._grna_names_and_seqs:
            if input_seq == potential_reference:
                found_name = potential_name
                break

        # if no perfect matches found, check for mismatches
        if found_name is None:
            # start just above the allowed limit so only matches within the limit can lower it
            min_found_mismatches = num_allowed_mismatches + 1
            for potential_name, potential_reference in self._grna_names_and_seqs:

                # this way is not elegant, but it IS fast :) .... >3x faster than XOR approach
                num_mismatches = 0
                for x in range(0, self._seq_len):
                    if input_seq[x] != potential_reference[x]:  # NB assumption that both are the same fixed length!
                        num_mismatches += 1
                        if num_mismatches > num_allowed_mismatches:
                            break

                if num_mismatches < min_found_mismatches:
                    min_found_mismatches = num_mismatches
                    if num_mismatches <= num_allowed_mismatches:
                        found_name = potential_name

        return found_name
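
The cell below is a minimal, optional sketch of how GrnaPositionMatcher is used, with two made-up 19-nt gRNA sequences: the forward read is compared directly against the gRNA references, while the reverse read is reverse-complemented first (this assumes rev_comp_canonical_dna_seq performs a standard reverse complement). In this toy case the forward read matches the first gRNA exactly and the reverse read matches the second within the one allowed mismatch.


In [ ]:
# Optional sketch with hypothetical 19-nt gRNA sequences (not real screen data)
example_grna_names_and_seqs = [("gRNA_X", "ACGTACGTACGTACGTACG"),
                               ("gRNA_Y", "TGCATGCATGCATGCATGC")]
example_matcher = GrnaPositionMatcher(example_grna_names_and_seqs, 19, 1, 1)

example_fw_read = "ACGTACGTACGTACGTACG"  # identical to gRNA_X
# a reverse read whose reverse complement differs from gRNA_Y only at the final base
example_rv_read = rev_comp_canonical_dna_seq("TGCATGCATGCATGCATGA")
print(example_matcher.find_fw_and_rv_read_matches(example_fw_read, example_rv_read))  # expect ('gRNA_X', 'gRNA_Y')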

In [ ]:
# %load /Users/Birmingham/Repositories/ccbb_tickets/20160210_mali_crispr/src/python/ccbbucsd/malicrispr/construct_counter.py
"""This module counts almost-perfect matches of small sequences within forward and reverse fastq sequence pairs."""

# standard libraries
import csv
import datetime
import logging

# ccbb libraries
from ccbbucsd.utilities.basic_fastq import FastqHandler, paired_fastq_generator

# project-specific libraries
from ccbbucsd.malicrispr.construct_file_extracter import get_construct_separator

__author__ = "Amanda Birmingham"
__maintainer__ = "Amanda Birmingham"
__email__ = "abirmingham@ucsd.edu"
__status__ = "development"


def get_counts_file_suffix():
    return "counts.txt"


def get_counter_from_names(names_to_count):
    return {x: 0 for x in names_to_count}


def generate_construct_counts(grna_matcher, construct_names, output_fp, fw_fastq_fp, rv_fastq_fp):
    counts_info_tuple = _match_and_count_constructs_from_files(grna_matcher, construct_names, fw_fastq_fp, rv_fastq_fp)
    counts_by_construct = counts_info_tuple[0]
    counts_by_type = counts_info_tuple[1]
    _write_counts(counts_by_construct, counts_by_type, output_fp)


def _match_and_count_constructs_from_files(grna_matcher, construct_names, fw_fastq_fp, rv_fastq_fp):
    construct_counts = get_counter_from_names(construct_names)
    fw_fastq_handler = FastqHandler(fw_fastq_fp)
    rv_fastq_handler = FastqHandler(rv_fastq_fp)
    return _match_and_count_constructs(grna_matcher, construct_counts, fw_fastq_handler, rv_fastq_handler)


def _match_and_count_constructs(grna_matcher, construct_counts, fw_fastq_handler, rv_fastq_handler):
    summary_counts = {"num_pairs": 0, "num_pairs_unrecognized": 0, "num_constructs_found": 0,
                      "num_constructs_unrecognized": 0, "num_constructs_recognized": 0}

    paired_fastq_seqs = paired_fastq_generator(fw_fastq_handler, rv_fastq_handler)
    for curr_pair_seqs in paired_fastq_seqs:
        summary_counts["num_pairs"] += 1
        _report_progress(summary_counts["num_pairs"])

        grna_name_A, grna_name_B = grna_matcher.find_fw_and_rv_read_matches(*curr_pair_seqs)
        if grna_name_A is not None and grna_name_B is not None:
            construct_name = _generate_construct_name(grna_name_A, grna_name_B)
            summary_counts["num_constructs_found"] += 1

            if construct_name in construct_counts:
                summary_counts["num_constructs_recognized"] += 1
                construct_counts[construct_name] += 1
            else:
                summary_counts["num_constructs_unrecognized"] += 1
                logging.debug("Unrecognized construct name: {0}".format(construct_name))
        else:
            summary_counts["num_pairs_unrecognized"] += 1
            logging.debug("Unrecognized sequence: {0},{1}".format(*curr_pair_seqs))

    return construct_counts, summary_counts


def _report_progress(num_fastq_pairs):
    if num_fastq_pairs % 100000 == 0:
        logging.info("On fastq pair number {0} at {1}".format(num_fastq_pairs, datetime.datetime.now()))


def _generate_construct_name(grna_name_A, grna_name_B):
    return "{0}{1}{2}".format(grna_name_A, get_construct_separator(), grna_name_B)


def _write_counts(counts_by_construct, counts_by_type, output_fp):
    construct_names = list(counts_by_construct.keys())
    construct_names.sort()

    with open(output_fp, 'w') as file_handle:
        summary_pieces = []
        for curr_key, curr_value in counts_by_type.items():
            summary_pieces.append("{0}:{1}".format(curr_key, curr_value))
        summary_comment = ",".join(summary_pieces)
        summary_comment = "# " + summary_comment
        header = ["construct_id", "counts"]
        writer = csv.writer(file_handle, delimiter="\t")

        writer.writerow([summary_comment])
        writer.writerow(header)
        for curr_name in construct_names:
            row = [curr_name, counts_by_construct[curr_name]]
            writer.writerow(row)
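
The cell below is a small, optional illustration (with made-up gRNA names) of the naming and counting conventions used above: a construct name is the two gRNA names joined by the construct separator, and get_counter_from_names builds the zero-initialized count dictionary that generate_construct_counts fills in.


In [ ]:
# Optional illustration with hypothetical gRNA names (not real screen data)
example_construct_names = [_generate_construct_name("gRNA_X", "gRNA_Y"),
                           _generate_construct_name("gRNA_Y", "gRNA_X")]
print(example_construct_names)  # e.g. ['gRNA_X__gRNA_Y', 'gRNA_Y__gRNA_X']
print(get_counter_from_names(example_construct_names))  # all counts start at zero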

In [ ]:
from ccbbucsd.utilities.files_and_paths import build_multipart_fp

def count_constructs_for_one_fastq_pair(curr_base, run_prefix, seq_len, num_allowed_mismatches, constructs_fp, 
                                        col_indices, output_dir, fw_fastq_fp, rv_fastq_fp):
    construct_names, grna_name_seq_pairs = extract_construct_and_grna_info(constructs_fp, col_indices)
    trimmed_grna_name_seq_pairs = trim_grnas(grna_name_seq_pairs, seq_len)
    # Note: the same num_allowed_mismatches value is currently used as the mismatch limit for both the
    # forward and reverse reads; different limits can be passed to GrnaPositionMatcher if desired
    grna_matcher = GrnaPositionMatcher(trimmed_grna_name_seq_pairs, seq_len, num_allowed_mismatches, 
                                       num_allowed_mismatches)    
    output_fp = build_multipart_fp(output_dir, [curr_base, run_prefix, get_counts_file_suffix()])
    generate_construct_counts(grna_matcher, construct_names, output_fp, fw_fastq_fp, rv_fastq_fp)
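
The cell below is an optional, hedged illustration of where each sample's counts file will land; 'example_sample_base' is a hypothetical placeholder for a real fastq base name, and the cell assumes only that build_multipart_fp combines the output directory and name parts the same way it does in count_constructs_for_one_fastq_pair above.


In [ ]:
# Optional illustration only; 'example_sample_base' is a hypothetical sample name, not a real fastq base
print(build_multipart_fp(g_fastq_counts_dir,
                         ["example_sample_base", g_fastq_counts_run_prefix, get_counts_file_suffix()]))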

In [ ]:
from ccbbucsd.utilities.parallel_process_fastqs import parallel_process_paired_reads, concatenate_parallel_results

g_col_indices = [int(x.strip()) for x in g_col_indices_str.split(",")]
g_parallel_results = parallel_process_paired_reads(g_filtered_fastqs_dir, get_filtered_file_suffix(), g_num_processors, 
                              count_constructs_for_one_fastq_pair, [g_fastq_counts_run_prefix, g_len_of_seq_to_match,
                                                                    g_num_allowed_mismatches, g_constructs_fp, 
                                                                    g_col_indices, g_fastq_counts_dir], True)

In [ ]:
print(concatenate_parallel_results(g_parallel_results))
